KIDNEY RENAL CLEAR-CELL-CARCINOMA EXPRESSION ANALYSE

Andreu Bofill Pumarola (andreu.bofill01@estudiant.upf.edu)
Sergio Castillo Lara (sergio.castillo01@estudiant.upf.edu)
Adrià Pérez Culubret (adria.perez06@estudiant.upf.edu)

LOADING DATA

library(SummarizedExperiment)
## Loading required package: GenomicRanges
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     grep, grepl, intersect, is.unsorted, lapply, lengths, Map,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unlist, unsplit
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
library(limma)
## 
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
library(edgeR)
library(sva)
## Loading required package: mgcv
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:IRanges':
## 
##     collapse
## This is mgcv 1.8-12. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
## 
## Attaching package: 'genefilter'
## The following object is masked from 'package:base':
## 
##     anyNA
library(ggplot2)
library(geneplotter)
## Loading required package: lattice
## Loading required package: annotate
## Loading required package: AnnotationDbi
## Loading required package: XML
library(org.Hs.eg.db)
## Loading required package: DBI
## 
library(GOstats)
## Loading required package: Category
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:IRanges':
## 
##     expand
## Loading required package: GO.db
## 
## Loading required package: graph
## 
## Attaching package: 'graph'
## The following object is masked from 'package:XML':
## 
##     addNode
## 
## Attaching package: 'GOstats'
## The following object is masked from 'package:AnnotationDbi':
## 
##     makeGOGraph
library(biomaRt)
library(KEGG.db)
## 
## KEGG.db contains mappings based on older data because the original
##   resource was removed from the the public domain before the most
##   recent update was produced. This package should now be
##   considered deprecated and future versions of Bioconductor may
##   not have it available.  Users who want more current data are
##   encouraged to look at the KEGGREST or reactome.db packages
library(e1071)
library(GSEABase)
library(GSVAdata)
## Loading required package: hgu95a.db
## 

We load the raw RNA-seq data counts set of Kidney renal clear-cell-carcinoma.

se <- readRDS(file.path("~/Documentos/MASTER/IEO/KIDNEY/seKIRC.rds"))


se
## class: RangedSummarizedExperiment 
## dim: 20115 614 
## metadata(5): experimentData annotation cancerTypeCode
##   cancerTypeDescription objectCreationDate
## assays(1): counts
## rownames(20115): 1 2 ... 102724473 103091865
## rowRanges metadata column names(3): symbol txlen txgc
## colnames(614): TCGA.3Z.A93Z.01A.11R.A37O.07
##   TCGA.6D.AA2E.01A.11R.A37O.07 ... TCGA.CZ.5988.11A.01R.1672.07
##   TCGA.CZ.5989.11A.01R.1672.07
## colData names(549): type bcr_patient_uuid ...
##   lymph_nodes_aortic_pos_by_ihc lymph_nodes_aortic_pos_total
table(data.frame(TYPE=se$type, GENDER=se$gender))
##         GENDER
## TYPE     FEMALE MALE
##   normal     20   52
##   tumor     187  337

Exploring phenotypic data

We take a look to the column data, which include the phenotipic information of all samples of the sumarized experiment object.

dim(colData(se))
## [1] 614 549
colData(se)[1:5, 1:5]
## DataFrame with 5 rows and 5 columns
##                                  type                     bcr_patient_uuid
##                              <factor>                             <factor>
## TCGA.3Z.A93Z.01A.11R.A37O.07    tumor 2B1DEA0A-6D55-4FDD-9C1C-0D9FBE03BD78
## TCGA.6D.AA2E.01A.11R.A37O.07    tumor D3B47E53-6F40-4FC8-B5A4-CBE548A770A9
## TCGA.A3.3306.01A.01R.0864.07    tumor 9fb55e0b-43d8-40a3-8ef2-d198e6290551
## TCGA.A3.3307.01A.01R.0864.07    tumor 7ac1d6c6-9ade-49af-8794-10b5b96b2b05
## TCGA.A3.3308.01A.02R.1325.07    tumor 3cbca837-f5a7-4a87-8f02-c59eac232d5a
##                              bcr_patient_barcode form_completion_date
##                                         <factor>             <factor>
## TCGA.3Z.A93Z.01A.11R.A37O.07        TCGA-3Z-A93Z           2014-11-11
## TCGA.6D.AA2E.01A.11R.A37O.07        TCGA-6D-AA2E            2014-3-17
## TCGA.A3.3306.01A.01R.0864.07        TCGA-A3-3306            2010-8-23
## TCGA.A3.3307.01A.01R.0864.07        TCGA-A3-3307            2010-4-13
## TCGA.A3.3308.01A.02R.1325.07        TCGA-A3-3308            2010-4-12
##                              prospective_collection
##                                            <factor>
## TCGA.3Z.A93Z.01A.11R.A37O.07                    YES
## TCGA.6D.AA2E.01A.11R.A37O.07                    YES
## TCGA.A3.3306.01A.01R.0864.07                     NO
## TCGA.A3.3307.01A.01R.0864.07                     NO
## TCGA.A3.3308.01A.02R.1325.07                     NO

We also need to observe to the metadata information content of these data.

mcols(colData(se), use.names=TRUE)
## DataFrame with 549 rows and 2 columns
##                                                          labelDescription
##                                                               <character>
## type                                           sample type (tumor/normal)
## bcr_patient_uuid                                         bcr patient uuid
## bcr_patient_barcode                                   bcr patient barcode
## form_completion_date                                 form completion date
## prospective_collection            tissue prospective collection indicator
## ...                                                                   ...
## lymph_nodes_pelvic_pos_total                               total pelv lnp
## lymph_nodes_aortic_examined_count                           total aor lnr
## lymph_nodes_aortic_pos_by_he                          aln pos light micro
## lymph_nodes_aortic_pos_by_ihc                                 aln pos ihc
## lymph_nodes_aortic_pos_total                                total aor-lnp
##                                         CDEID
##                                   <character>
## type                                       NA
## bcr_patient_uuid                           NA
## bcr_patient_barcode                   2673794
## form_completion_date                       NA
## prospective_collection                3088492
## ...                                       ...
## lymph_nodes_pelvic_pos_total          3151828
## lymph_nodes_aortic_examined_count     3104460
## lymph_nodes_aortic_pos_by_he          3151832
## lymph_nodes_aortic_pos_by_ihc         3151831
## lymph_nodes_aortic_pos_total          3151827

As we can see there are three different columns on the metadata result. The first column is the name of all the variables; the second is the labelDescription, which is the definition of each variable; and the last one corresponding to the Common Data Element Identifier (CDEID).

Exploring feature data

We look through the data rows, which are the feature elements, to see the genes information of our data set.

rowRanges(se)
## GRanges object with 20115 ranges and 3 metadata columns:
##             seqnames               ranges strand   |      symbol     txlen
##                <Rle>            <IRanges>  <Rle>   | <character> <integer>
##           1    chr19 [58345178, 58362751]      -   |        A1BG      3322
##           2    chr12 [ 9067664,  9116229]      -   |         A2M      4844
##           9     chr8 [18170477, 18223689]      +   |        NAT1      2280
##          10     chr8 [18391245, 18401218]      +   |        NAT2      1322
##          12    chr14 [94592058, 94624646]      +   |    SERPINA3      3067
##         ...      ...                  ...    ... ...         ...       ...
##   100996331    chr15 [20835372, 21877298]      -   |       POTEB      1706
##   101340251    chr17 [40027542, 40027645]      -   |    SNORD124       104
##   101340252     chr9 [33934296, 33934376]      -   |   SNORD121B        81
##   102724473     chrX [49303669, 49319844]      +   |      GAGE10       538
##   103091865    chr21 [39313935, 39314962]      +   |   BRWD1-IT2      1028
##                  txgc
##             <numeric>
##           1 0.5644190
##           2 0.4882329
##           9 0.3942982
##          10 0.3895613
##          12 0.5249429
##         ...       ...
##   100996331 0.4308324
##   101340251 0.4903846
##   101340252 0.4074074
##   102724473 0.5055762
##   103091865 0.5924125
##   -------
##   seqinfo: 455 sequences (1 circular) from hg38 genome

DGE object

We create an object to hold the dataset to be analysed in a better and more comprehensive way.

dge <- DGEList(counts=assays(se)$counts, genes=mcols(se))
## Warning in as.data.frame.DataFrame(genes, stringsAsFactors = FALSE):
## Arguments in '...' ignored

Moreover, we calculate \(\log_2\) CPM values of expression and we save them in an assay element.

assays(se)$logCPM <- cpm(dge, log=TRUE, prior.count=0.5)

QUALITY AND NORMALIZATION

Library size filtering

At this point, we need to determine if the library sizes of tumor samples and normal samples are similar or not.

libsize <- data.frame(libsize = dge$sample$lib.size/1e6, type = se$type)

summary(libsize)
##     libsize            type    
##  Min.   :  3.049   normal: 72  
##  1st Qu.: 38.075   tumor :542  
##  Median : 46.715               
##  Mean   : 46.679               
##  3rd Qu.: 54.647               
##  Max.   :115.546
Figure S1: Density distribution of library size.

Figure S1: Density distribution of library size.

This past figure shows the density distribution on sequencing depth (milions of reads).

Figure S2: Histogram of library size distribution.

Figure S2: Histogram of library size distribution.

In this other figure, we can see now the proportion of samples in each sequencing depth (Milions of Reads).

As we can see, both figures show that the normal and tumor samples are similar in terms of sequencing depth, except for some samples that show extreme values.

Moreover, we can see how both normal and tumor types, follow a normal distribution through the sequencing depth.

At this point, we can filter out those samples with a low sequencing depth (<45 Milions of reads) because they are less reliable.

dge.filt <- dge[, dge$sample$lib.size/1e6 > 45 ]

final.samples <- rownames(dge.filt$samples)

se.filt <-se[,colnames(se) %in% final.samples]
se.normal <- se.filt[,se.filt$type == "normal"]
se.tumor  <- se.filt[,se.filt$type == "tumor"]


normal.code <- substr(colnames(se.normal), 9, 12)
tumor.code <- substr(colnames(se.tumor), 9, 12)

common.codes <- intersect(normal.code, tumor.code)
length(common.codes)
## [1] 38
se.paired <- se.filt[,substr(colnames(se.filt), 9, 12) %in% common.codes]
summary(se.paired$type)
## normal  tumor 
##     38     38
colData(se.paired)$samplecodes <- substr(colnames(se.paired), 9, 12)
dge.filt <- DGEList(counts=assays(se.paired)$counts, genes=mcols(se.paired))
## Warning in as.data.frame.DataFrame(genes, stringsAsFactors = FALSE):
## Arguments in '...' ignored
Figure S3: Library sizes in increasing order.

Figure S3: Library sizes in increasing order.

Genes filtering: Distribution of expression levels

par(mfrow=c(1,2), mar=c(4,5,1,1))

Expression levels comparison between all samples

In the next part, we take a look on the distribution of expression values per sample in terms of logarithmic CPM units.

Figure S4: Multidensity plot of log2CPM.

Figure S4: Multidensity plot of log2CPM.

We cannot observe significant diferences between samples in terms of expression levels.

As we have more than 200 samples, we now display the normal and tumor distribution separately.

Expression levels comparison between normal and tumor

normalCPM <- se.paired[,se.paired$type == "normal"]
tumorCPM <- se.paired[,se.paired$type == "tumor"]
Figure S5: Multidensity plot of normal samples log2CPM.

Figure S5: Multidensity plot of normal samples log2CPM.

Figure S6: Multidensity plot of tumor samples log2CPM.

Figure S6: Multidensity plot of tumor samples log2CPM.

Distribution of expression levels among genes

Now, we want to filter out the genes with a low expression. To do that, we can plot the CPMs in logarithmic scale.

genemean <- data.frame(Means=aveLogCPM(assays(se.paired)$counts))
## Warning: `geom_bar()` no longer has a `binwidth` parameter. Please use
## `geom_histogram()` instead.
Figure S7: Distribution of expression levels among genes.

Figure S7: Distribution of expression levels among genes.

We can see how there are two peaks of genes. The first one correspond to these genes with a very low expression and thus we decide to remove them. In other to do that, we used a minimum average of 1 LogCPM unit as a cutoff to filter out lowly expressed genes.

avgexp <- aveLogCPM(assays(se.paired)$counts)
mask <- avgexp > 1

These are the numbers of genes before filtering:

dim(se.paired) # Before filtering SE
## [1] 20115    76
dim(dge.filt) # Before filtering DGE
## [1] 20115    76

These are the numbers of samples genes after filtering:

se.paired.genes <- se.paired[mask, ]
dim(se.paired.genes) # After filtering SE
## [1] 12495    76
dge.filt <- dge.filt[mask, ]
dim(dge.filt) # After filtering DGE
## [1] 12495    76

At this point, we can calculate the normaliation factors on the filtered expression data set.

dgenorm <- calcNormFactors(dge.filt)
assays(se.paired.genes)$logCPMnorm <- cpm(dgenorm, log=TRUE, prior.count=0.5)

MA-plots

We look at the MA-plots of the normal samples to see if there are systematic biases in gene expression levels in any of the sampes. We expect the slope to be around zero.

Figure S8: MA-plots of normal samples.

Figure S8: MA-plots of normal samples.

Now, we examine the tumor samples:

Figure S9: MA-plots of tumor samples.

Figure S9: MA-plots of tumor samples.

After observing them, one normal sample has been identified with a biased gene expression, so we take it out of our dataset. This sample is the TGCA-CW-5591.

dim(se.paired.genes) # Before filtering SE
## [1] 12495    76
dim(dge.filt) # Before filtering DGE
## [1] 12495    76
se.paired.genes <- se.paired.genes[,!se.paired.genes$bcr_patient_barcode %in% "TCGA-CW-5591"]
dim(se.paired.genes) # After filtering SE
## [1] 12495    74
dge.filt <- dge.filt[, !substr(rownames(dge.filt$samples), 1, 12) %in% "TCGA.CW.5591"]
dim(dge.filt) # After filtering DGE
## [1] 12495    74
dgenorm <- dgenorm[, !substr(rownames(dgenorm$samples), 1, 12) %in% "TCGA.CW.5591"]
dim(dgenorm) # After filtering DGE
## [1] 12495    74

As we can see, we don’t see any tumor samples with major biases in gene expression

Batch identification

Now we’re going to analyze our data in order to search for batch effect that could interfere with the biological signal. First, we analyze some of the information contained in the barcode, such as tissue source site, center of sequenciation, plate and portion and analyte combinations. We use two approaches to try to identify batch effect, the Hierarchical Clustering and a Multidimensional Scaling plot.

tss <- substr(colnames(se.paired.genes), 6, 7)
table(tss)
## tss
## B0 B2 B8 CJ CW CZ 
## 14  2  4  4 14 36
center <- substr(colnames(se.paired.genes), 27, 28)
table(center)
## center
## 07 
## 74
plate <- substr(colnames(se.paired.genes), 22, 25)
table(plate)
## plate
## 1503 1541 1672 
##   38   18   18
portionanalyte <- substr(colnames(se.paired.genes), 18, 20)
table(portionanalyte)
## portionanalyte
## 01R 02R 11R 
##  58   4  12
samplevial <- substr(colnames(se.paired.genes), 14, 16)
table(samplevial)
## samplevial
## 01A 01B 11A 
##  36   1  37

GENDER batch effect

table(data.frame(TYPE=se.paired.genes$type, GENDER=se.paired.genes$gender))
##         GENDER
## TYPE     FEMALE MALE
##   normal     13   24
##   tumor      13   24
#logCPM <- cpm(dgenorm, log=TRUE, prior.count=3)
d <- as.dist(1-cor(assays(se.paired.genes)$logCPMnorm, method="spearman"))
sampleClustering <- hclust(d)
batch <- as.integer(factor(se.paired.genes$gender))
sampleDendrogram <- as.dendrogram(sampleClustering, hang=0.1)
names(batch) <- colnames(se.paired.genes)
outcome <- paste(substr(colnames(se.paired.genes), 9, 12), as.character(se.paired.genes$type), sep="-")
names(outcome) <- colnames(se.paired.genes)
sampleDendrogram <- dendrapply(sampleDendrogram,
                               function(x, batch, labels) {
                                 if (is.leaf(x)) {
                                   attr(x, "nodePar") <- list(lab.col=as.vector(batch[attr(x, "label")]))
                                   attr(x, "label") <- as.vector(labels[attr(x, "label")])
                                 }
                                 x
                               }, batch, outcome)
plot(sampleDendrogram, main="Hierarchical clustering of samples")
legend("topright", paste("TSS", sort(unique(batch)), levels(factor(se.paired.genes$gender))), fill=sort(unique(batch)))
Figure S10: Hierarchical clustering of samples.

Figure S10: Hierarchical clustering of samples.

plotMDS(dgenorm, labels=outcome, col=batch)
legend("bottomleft", paste("Gender", sort(unique(batch)), levels(factor(se.paired.genes$gender))),
       fill=sort(unique(batch)), inset=0.05)
Figure S11: Multidimensional scaling plot of samples separated by gender.

Figure S11: Multidimensional scaling plot of samples separated by gender.

TSS batch effect

table(data.frame(TYPE=se.paired.genes$type, TSS=tss))
##         TSS
## TYPE     B0 B2 B8 CJ CW CZ
##   normal  7  1  2  2  7 18
##   tumor   7  1  2  2  7 18
#logCPM <- cpm(dgenorm, log=TRUE, prior.count=3)
d <- as.dist(1-cor(assays(se.paired.genes)$logCPMnorm, method="spearman"))
sampleClustering <- hclust(d)
batch <- as.integer(factor(tss))
sampleDendrogram <- as.dendrogram(sampleClustering, hang=0.1)
names(batch) <- colnames(se.paired.genes)
outcome <- paste(substr(colnames(se.paired.genes), 9, 12), as.character(se.paired.genes$type), sep="-")
names(outcome) <- colnames(se.paired.genes)
sampleDendrogram <- dendrapply(sampleDendrogram,
                               function(x, batch, labels) {
                                 if (is.leaf(x)) {
                                   attr(x, "nodePar") <- list(lab.col=as.vector(batch[attr(x, "label")]))
                                   attr(x, "label") <- as.vector(labels[attr(x, "label")])
                                 }
                                 x
                               }, batch, outcome)
plot(sampleDendrogram, main="Hierarchical clustering of samples")
legend("topright", paste("TSS", sort(unique(batch)), levels(factor(tss))), fill=sort(unique(batch)))
Figure S12: Hierarchical clustering of samples separated by TSS.

Figure S12: Hierarchical clustering of samples separated by TSS.

plotMDS(dgenorm, labels=outcome, col=batch)
legend("bottomleft", paste("TSS", sort(unique(batch)), levels(factor(tss))),
       fill=sort(unique(batch)), inset=0.05)
Figure S13: Multidimensional scaling plot of samples separated by TSS.

Figure S13: Multidimensional scaling plot of samples separated by TSS.

PLATE batch effect

table(data.frame(TYPE=se.paired.genes$type, PLATE=plate))
##         PLATE
## TYPE     1503 1541 1672
##   normal   19    9    9
##   tumor    19    9    9
#logCPM <- cpm(dgenorm, log=TRUE, prior.count=3)
d <- as.dist(1-cor(assays(se.paired.genes)$logCPMnorm, method="spearman"))
sampleClustering <- hclust(d)
batch <- as.integer(factor(plate))
sampleDendrogram <- as.dendrogram(sampleClustering, hang=0.1)
names(batch) <- colnames(se.paired.genes)
outcome <- paste(substr(colnames(se.paired.genes), 9, 12), as.character(se.paired.genes$type), sep="-")
names(outcome) <- colnames(se.paired.genes)
sampleDendrogram <- dendrapply(sampleDendrogram,
                               function(x, batch, labels) {
                                 if (is.leaf(x)) {
                                   attr(x, "nodePar") <- list(lab.col=as.vector(batch[attr(x, "label")]))
                                   attr(x, "label") <- as.vector(labels[attr(x, "label")])
                                 }
                                 x
                               }, batch, outcome)
plot(sampleDendrogram, main="Hierarchical clustering of samples")
legend("topright", paste("PLATE", sort(unique(batch)), levels(factor(plate))), fill=sort(unique(batch)))
Figure S14: Hierarchical clustering of the samples separated by PLATE.

Figure S14: Hierarchical clustering of the samples separated by PLATE.

plotMDS(dgenorm, labels=outcome, col=batch)
legend("bottomleft", paste("plate", sort(unique(batch)), levels(factor(plate))),
       fill=sort(unique(batch)), inset=0.05)
Figure S15: Multidimensional scaling plot of the samples separated by PLATE.

Figure S15: Multidimensional scaling plot of the samples separated by PLATE.

PORTION-ANALYTE batch effect

table(data.frame(TYPE=se.paired.genes$type, P.analyte = portionanalyte))
##         P.analyte
## TYPE     01R 02R 11R
##   normal  35   2   0
##   tumor   23   2  12
#logCPM <- cpm(dgenorm, log=TRUE, prior.count=3)
d <- as.dist(1-cor(assays(se.paired.genes)$logCPMnorm, method="spearman"))
sampleClustering <- hclust(d)
batch <- as.integer(factor(portionanalyte))
sampleDendrogram <- as.dendrogram(sampleClustering, hang=0.1)
names(batch) <- colnames(se.paired.genes)
outcome <- paste(substr(colnames(se.paired.genes), 9, 12), as.character(se.paired.genes$type), sep="-")
names(outcome) <- colnames(se.paired.genes)
sampleDendrogram <- dendrapply(sampleDendrogram,
                               function(x, batch, labels) {
                                 if (is.leaf(x)) {
                                   attr(x, "nodePar") <- list(lab.col=as.vector(batch[attr(x, "label")]))
                                   attr(x, "label") <- as.vector(labels[attr(x, "label")])
                                 }
                                 x
                               }, batch, outcome)
plot(sampleDendrogram, main="Hierarchical clustering of samples")
legend("topright", paste("Portionanalyte", sort(unique(batch)), levels(factor(portionanalyte))), fill=sort(unique(batch)))
Figure S16: Hierarchical clustering of the samples separated by PortionAnalyte.

Figure S16: Hierarchical clustering of the samples separated by PortionAnalyte.

plotMDS(dgenorm, labels=outcome, col=batch)
legend("bottomleft", paste("Portionanalyte", sort(unique(batch)), levels(factor(portionanalyte))),
       fill=sort(unique(batch)), inset=0.05)
Figure S17: Multidimensional scaling plot of samples separated by PortionAnalyte.

Figure S17: Multidimensional scaling plot of samples separated by PortionAnalyte.

The Hierarchical clustering and the Multidimensional Scaling plots show a clear differentiation between normal samples and tumor samples. All four plots from the different elements of the barcode have a similar clustering and distribution. There is no clear effect from the batch indicators in the clustering, and it seems to have a balanced design (more clearly in tumor than in normal), although it seems that 9 tumor samples cluster apart from the other samples in the hierarchical clustering, so we should consider taking apart this samples.

DIFFERENTIAL EXPRESSION ANALYSIS

In order to identify differentially expressed genes, we used a linear model with two factors: one being the cell type (tumor or normal) and the other being the individual/sample.

We chose a False Discovery Rate of 0.01 (1%). In order to asses the DE genes, we applied a Surrogate Variable Analysis (sva) and we corrected the possible biases of the data estimating the mean-variance relationship using the voom function. The p-values were adjusted using the FDR.

Mean variance relationship

design <- model.matrix(~type + samplecodes, data = colData(se.paired.genes))
v <- voom(dgenorm, design, plot=FALSE)

FDRcutoff <- 0.01
mod0 <- model.matrix(~samplecodes, colData(se.paired.genes))
sv <- sva(v$E, mod = design, mod0 = mod0)
## Number of significant surrogate variables is:  7 
## Iteration (out of 5 ):1  2  3  4  5
design.voom <- ""
design.voom <- cbind(design, sv$sv)


fit <- lmFit(v, design.voom)
fit <- eBayes(fit)
res <- decideTests(fit, p.value = FDRcutoff)
Figure S18: Volcano plot of DE analysis.

Figure S18: Volcano plot of DE analysis.

toptable <- topTable(fit, coef = 2, n=Inf)
Figure S19: Pvalue distribution and qqplot.

Figure S19: Pvalue distribution and qqplot.

As we can see in these two previous plots, we obtained many genes that are considered to be differentially expressed between tumor and normal samples (using an FDR of 0.01).

Over vs under

We ploted the Fold Change distribution in order to asses if this variable could be used in order to further reduce the number of differentially expressed genes. This could potentially give us a narrower list of genes that may be related to the development of the Kidney clear cell carcinoma.

Figure S20: Plot of logFC distribution.

Figure S20: Plot of logFC distribution.

ks.test(x=toptable$logFC,y='pnorm',alternative='two.sided')
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  toptable$logFC
## D = 0.09324, p-value < 2.2e-16
## alternative hypothesis: two-sided

We decided to use a Fold Change threshold of +5 and -5, to reduce the list of DE genes to only a handful of genes. This list is stored in the variables (under_exp and over_exp). We also divided the list of genes in two groups: those with a positive fold change and those with a negative one. This list will be used for the Functional enrichment analysis.

under_exp <- toptable[toptable$logFC <= -5 ,]
over_exp  <- toptable[toptable$logFC >= 5,] 

under_exp.all <- toptable[toptable$logFC <= 0 ,]
over_exp.all  <- toptable[toptable$logFC >= 0,] 

test_overexp <- toptable[toptable$logFC >= 3,]  

dim(under_exp)
## [1] 153   9
dim(over_exp)
## [1] 46  9
head(over_exp[order(over_exp$logFC, decreasing = TRUE),])
##        symbol txlen      txgc     logFC    AveExpr        t      P.Value
## 2184      FAH  3427 0.5170703 10.115493  0.8164831 16.72456 1.562600e-17
## 81617  CAB39L  3692 0.4320152  9.104303  3.4604998 22.65289 1.746516e-21
## 389320  TEX43   464 0.4030172  8.527648 -1.9867210 17.35212 5.303748e-18
## 973     CD79A  1258 0.6073132  8.336017  0.9896687 26.65775 1.142480e-23
## 5965    RECQL  3702 0.3725014  7.984984  2.0659290 17.20022 6.869086e-18
## 401387  LRRD1  2830 0.3395760  7.622375 -0.7109079 18.74206 5.402535e-19
##           adj.P.Val        B
## 2184   2.532386e-16 29.46327
## 81617  1.536811e-19 38.72011
## 389320 1.022690e-16 29.64651
## 973    2.833658e-21 42.55773
## 5965   1.275323e-16 30.74923
## 401387 1.480366e-17 32.07925
head(under_exp[order(under_exp$logFC, decreasing = FALSE),])
##         symbol txlen      txgc     logFC  AveExpr         t      P.Value
## 400713  ZNF880  2230 0.3995516 -12.46608 6.106883 -22.48152 2.203720e-21
## 361       AQP4  5274 0.3795980 -12.09207 5.307332 -22.84434 1.349424e-21
## 125875  CLDND2   974 0.6190965 -11.38629 1.464706 -24.75417 1.138355e-22
## 2592      GALT  2566 0.5420889 -10.99625 2.085865 -24.36734 1.852009e-22
## 8649   LAMTOR3  4226 0.3774255 -10.96988 3.871072 -22.36824 2.572036e-21
## 391196   OR2M7   939 0.4675186 -10.65841 1.105295 -27.39939 4.857318e-24
##           adj.P.Val        B
## 400713 1.835699e-19 38.66244
## 361    1.286424e-19 38.94593
## 125875 1.725189e-20 39.02897
## 2592   2.461793e-20 39.31175
## 8649   2.034024e-19 38.14609
## 391196 1.619172e-21 41.34597

Gene set enrichment analysis

We performed a Gene set enrichment analysis. We used the data set called c2BroadSets from the GSVAdata, to obtain the different gene sets, restricting the pathways to the ones from KEGG, REACTOME and BIOCARTA.

data(c2BroadSets)

# Filtering to only KEGG, Reactome and BioCarta
gsc <- c2BroadSets[c(grep("^KEGG", names(c2BroadSets)),
               grep("^REACTOME", names(c2BroadSets)), grep("^BIOCARTA", names(c2BroadSets)))]

gsc <- mapIdentifiers(gsc, AnnoOrEntrezIdentifier(metadata(se.paired.genes)$annotation))
gsmatrix <- incidence(gsc)
dim(gsmatrix)
## [1]  833 6744

We remove the genes from the gene sets that are not present in our data, and we also discard the genes in our data that are not present in the gene sets.

# Discard genes not in our data and genes not in the gene sets

gsmatrix <- gsmatrix[, colnames(gsmatrix) %in% rownames(se.paired.genes)]
dim(gsmatrix)
## [1]  833 4038
se.paired.gsea <- se.paired.genes[colnames(gsmatrix), ]
dim(se.paired.genes)
## [1] 12495    74
dim(se.paired.gsea)
## [1] 4038   74
dgenorm.gsea <- dgenorm[colnames(gsmatrix), ]
dim(dgenorm)
## [1] 12495    74
dim(dgenorm.gsea)
## [1] 4038   74

We removed the sets with a size inferior to 5.

gsmatrix <- gsmatrix[rowSums(gsmatrix) >= 5, ]
dim(gsmatrix)
## [1]  807 4038
design <- model.matrix(~type + samplecodes, data = colData(se.paired.gsea))

v <- voom(dgenorm, design, plot=FALSE)
FDRcutoff <- 0.01
mod0 <- model.matrix(~samplecodes, colData(se.paired.gsea))
sv <- sva(v$E, mod = design, mod0 = mod0)
## Number of significant surrogate variables is:  7 
## Iteration (out of 5 ):1  2  3  4  5
design.voom <- ""
design.voom <- cbind(design, sv$sv)

fit <- lmFit(v, design.voom)
fit <- eBayes(fit)
toptable.gsea <- topTable(fit, coef = 2, n=Inf)

tGSgenes <- toptable.gsea[match(colnames(gsmatrix), rownames(toptable.gsea)), "t"]
head(tGSgenes)
## [1]  -4.768989  -4.177214   1.941461  12.904205 -11.136535   2.767150
zS <- sqrt(rowSums(gsmatrix)) * (as.vector(gsmatrix %*% tGSgenes)/rowSums(gsmatrix))
length(zS)
## [1] 807
head(zS)
##               KEGG_GLYCOLYSIS_GLUCONEOGENESIS 
##                                    -15.558933 
##                  KEGG_CITRATE_CYCLE_TCA_CYCLE 
##                                     -1.217683 
##                KEGG_PENTOSE_PHOSPHATE_PATHWAY 
##                                     -1.592216 
## KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS 
##                                     -4.944474 
##          KEGG_FRUCTOSE_AND_MANNOSE_METABOLISM 
##                                    -13.367401 
##                     KEGG_GALACTOSE_METABOLISM 
##                                    -14.827441
pv <- pmin(pnorm(zS), 1 - pnorm(zS))
pvadj <- p.adjust(pv, method = "fdr")
DEgs <- names(pvadj)[which(pvadj < 0.01)]
length(DEgs)
## [1] 677
Figure S21: GSEA qqplot of normal samples.

Figure S21: GSEA qqplot of normal samples.

rnkGS <- sort(abs(zS), decreasing = TRUE)
head(rnkGS,27)
##                      KEGG_OXIDATIVE_PHOSPHORYLATION 
##                                            51.25554 
##                       KEGG_PRIMARY_IMMUNODEFICIENCY 
##                                            41.91329 
##        REACTOME_DOWNSTREAM_EVENTS_IN_GPCR_SIGNALING 
##                                            40.84898 
##       REACTOME_SLC_MEDIATED_TRANSMEMBRANE_TRANSPORT 
##                                            39.72385 
##         KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION 
##                                            35.65341 
##                     KEGG_JAK_STAT_SIGNALING_PATHWAY 
##                                            34.85214 
##                          KEGG_FATTY_ACID_METABOLISM 
##                                            33.10837 
##                             KEGG_PATHWAYS_IN_CANCER 
##                                            32.33795 
##             KEGG_VASCULAR_SMOOTH_MUSCLE_CONTRACTION 
##                                            32.31561 
## REACTOME_TRANSMEMBRANE_TRANSPORT_OF_SMALL_MOLECULES 
##                                            32.18810 
##   REACTOME_GLUCOSE_AND_OTHER_SUGAR_SLC_TRANSPORTERS 
##                                            31.77223 
##                          BIOCARTA_CELLCYCLE_PATHWAY 
##                                            31.56679 
##        REACTOME_CHEMOKINE_RECEPTORS_BIND_CHEMOKINES 
##                                            31.51400 
##                REACTOME_G_ALPHA_Q_SIGNALLING_EVENTS 
##                                            31.31659 
##                        BIOCARTA_TCAPOPTOSIS_PATHWAY 
##                                            31.13932 
##                         KEGG_SMALL_CELL_LUNG_CANCER 
##                                            30.89871 
##                     KEGG_HEMATOPOIETIC_CELL_LINEAGE 
##                                            30.09452 
##                            BIOCARTA_THELPER_PATHWAY 
##                                            29.81463 
##                              REACTOME_PD1_SIGNALING 
##                                            29.49595 
##                         BIOCARTA_SALMONELLA_PATHWAY 
##                                            29.44496 
##                        REACTOME_SEROTONIN_RECEPTORS 
##                                            29.40978 
##           REACTOME_PEPTIDE_LIGAND_BINDING_RECEPTORS 
##                                            29.32447 
##                    KEGG_CHEMOKINE_SIGNALING_PATHWAY 
##                                            28.53385 
##                               BIOCARTA_TCRA_PATHWAY 
##                                            28.43250 
##                               BIOCARTA_IL17_PATHWAY 
##                                            28.27425 
##    REACTOME_INORGANIC_CATION_ANION_SLC_TRANSPORTERS 
##                                            28.00338 
##                         KEGG_VEGF_SIGNALING_PATHWAY 
##                                            27.88516
## plot stuff
plotGS <- function(se, gs, pheno, ...) {
    l <- levels(colData(se)[, pheno])
    idxSamples1 <- colData(se)[, pheno] == l[1]
    idxSamples2 <- colData(se)[, pheno] == l[2]
    exps1 <- rowMeans(assays(se)$logCPMnorm[gs, idxSamples1])
    exps2 <- rowMeans(assays(se)$logCPMnorm[gs, idxSamples2])
    rng <- range(c(exps1, exps2))
    plot(exps1, exps2, pch = 21, col = "black", bg = "black", xlim = rng, ylim = rng, 
        xlab = l[1], ylab = l[2], ...)
    abline(a = 0, b = 1, lwd = 2, col = "red")
}

We ploted the logCPMs of tumor and normal samples for the first 27 gene sets sorted by z-score.

Figure S22: Scatter plots of gene sets sorted by z-score.

Figure S22: Scatter plots of gene sets sorted by z-score.

Functional enrichment

We performed a functional enrichment analysis using Biological Process Gene Ontology terms. We performed a Fisher’s Exact Test. We applied this methodology on three gene sets: one with all the differentially expressed genes (using an adjusted P-value of 0.01), another one with only over-expressed genes and finally one with only under-expressed genes (both using an adj. P-value threshold of 0.01).

Over represented GOs (ALL)

DEgenes <- rownames(toptable)[toptable$adj.P.Val < 0.01]
length(DEgenes)
## [1] 9108
geneUniverse <- rownames(se.paired.genes)
params <- new("GOHyperGParams", geneIds=DEgenes, universeGeneIds=geneUniverse,
              annotation="org.Hs.eg.db", ontology="BP",
              pvalueCutoff=0.01, testDirection="over")

conditional(params) <- TRUE
hgOver <- hyperGTest(params)
GOresults <- summary(hgOver)
GOresults <- GOresults[GOresults$Size >= 5 & GOresults$Count >= 5,]

GOresults <- GOresults[order(GOresults$OddsRatio, decreasing = TRUE), ]
GOresults <- GOresults[order(GOresults$Pvalue, decreasing = FALSE), ]

head(GOresults)
##       GOBPID       Pvalue OddsRatio ExpCount Count Size
## 1 GO:0051301 0.0002916266  1.538802 293.3672   323  402
## 2 GO:0002768 0.0009905728  1.563713 222.5796   246  305
## 3 GO:0019932 0.0010002937  2.102163  91.9509   107  126
## 4 GO:0071774 0.0010613786  1.689018 164.9278   185  226
## 5 GO:0060429 0.0011365753  1.354855 452.4568   485  620
## 6 GO:0071363 0.0012685419  1.379690 394.8051   425  541
##                                                                 Term
## 1                                                      cell division
## 2 immune response-regulating cell surface receptor signaling pathway
## 3                                second-messenger-mediated signaling
## 4                               response to fibroblast growth factor
## 5                                             epithelium development
## 6                        cellular response to growth factor stimulus
htmlReport(hgOver, file = "gotests.html")

Over represented GOs (Overexpressed DE)

DEgenes <- rownames(over_exp.all)[over_exp.all$adj.P.Val < 0.01]
length(DEgenes)
## [1] 5094
geneUniverse <- rownames(se.paired.genes)
params <- new("GOHyperGParams", geneIds=DEgenes, universeGeneIds=geneUniverse,
              annotation="org.Hs.eg.db", ontology="BP",
              pvalueCutoff=0.01, testDirection="over")

conditional(params) <- TRUE
hgOver <- hyperGTest(params)
GOresultsover <- summary(hgOver)
GOresultsover <- GOresultsover[GOresultsover$Size >= 5 & GOresultsover$Count >= 5,]

GOresultsover <- GOresultsover[order(GOresultsover$OddsRatio, decreasing = TRUE), ]
GOresultsover <- GOresultsover[order(GOresultsover$Pvalue, decreasing = FALSE), ]

head(GOresultsover)
##       GOBPID       Pvalue OddsRatio  ExpCount Count Size
## 1 GO:0051301 1.768797e-05  1.533603 165.24901   206  402
## 2 GO:0051251 3.038639e-05  2.063637  53.84980    77  131
## 3 GO:0007156 3.677431e-05  2.469144  34.52964    53   84
## 4 GO:0008283 8.589732e-05  1.275001 459.16206   518 1117
## 5 GO:0032946 1.057712e-04  2.633582  26.71937    42   65
## 6 GO:0050867 1.128832e-04  1.883097  59.60474    82  145
##                                                              Term
## 1                                                   cell division
## 2                    positive regulation of lymphocyte activation
## 3 homophilic cell adhesion via plasma membrane adhesion molecules
## 4                                              cell proliferation
## 5           positive regulation of mononuclear cell proliferation
## 6                          positive regulation of cell activation
htmlReport(hgOver, file = "gotestsover.html")

Over represented GOs (Underexpressed DE)

DEgenes <- rownames(under_exp.all)[under_exp.all$adj.P.Val < 0.01]
length(DEgenes)
## [1] 4014
geneUniverse <- rownames(se.paired.genes)
params <- new("GOHyperGParams", geneIds=DEgenes, universeGeneIds=geneUniverse,
              annotation="org.Hs.eg.db", ontology="BP",
              pvalueCutoff=0.01, testDirection="over")

conditional(params) <- TRUE
hgUnder <- hyperGTest(params)
GOresultsunder <- summary(hgUnder)
GOresultsunder <- GOresultsunder[GOresultsunder$Size >= 5 & GOresultsunder$Count >= 5,]

GOresultsunder <- GOresultsunder[order(GOresultsunder$OddsRatio, decreasing = TRUE), ]
GOresultsunder <- GOresultsunder[order(GOresultsunder$Pvalue, decreasing = FALSE), ]

head(GOresultsunder)
##       GOBPID       Pvalue OddsRatio  ExpCount Count Size
## 1 GO:0007210 9.939131e-06 25.749672  4.143125    12   13
## 2 GO:0009145 9.815811e-05  4.534902  8.923653    19   28
## 3 GO:0009201 9.815811e-05  4.534902  8.923653    19   28
## 4 GO:0022904 1.096097e-04  2.731924 18.803412    33   59
## 5 GO:0006754 2.123284e-04  4.294813  8.604951    18   27
## 6 GO:0015991 2.123284e-04  4.294813  8.604951    18   27
##                                                  Term
## 1                serotonin receptor signaling pathway
## 2 purine nucleoside triphosphate biosynthetic process
## 3    ribonucleoside triphosphate biosynthetic process
## 4                respiratory electron transport chain
## 5                            ATP biosynthetic process
## 6             ATP hydrolysis coupled proton transport
htmlReport(hgUnder, file = "gotestsunder.html")

Over represented KEGGs (ALL)

DEgenes <- rownames(toptable)[toptable$adj.P.Val < 0.01]
length(DEgenes)
## [1] 9108
geneUniverse <- rownames(se.paired.genes)
params <- new("KEGGHyperGParams", geneIds=DEgenes, universeGeneIds=geneUniverse,
              annotation="org.Hs.eg.db", pvalueCutoff=0.01, testDirection="over")

hgOver <- hyperGTest(params)

hgOver
## Gene to KEGG  test for over-representation 
## 228 KEGG ids tested (6 have p < 0.01)
## Selected gene set size: 2577 
##     Gene universe size: 3494 
##     Annotation package: org.Hs.eg
KEGGresults <- summary(hgOver)

# Filtering by counts and size
KEGGresults <- KEGGresults[KEGGresults$Size >= 5 & KEGGresults$Count >= 5, ]


KEGGresults <- KEGGresults[order(KEGGresults$OddsRatio, decreasing = TRUE), ]
KEGGresults <- KEGGresults[order(KEGGresults$Pvalue, decreasing = FALSE), ]

head(KEGGresults)
##   KEGGID      Pvalue OddsRatio ExpCount Count Size
## 1  05340 0.001637912       Inf 15.48855    21   21
## 2  04660 0.002350134  3.126865 42.77790    52   58
## 3  05160 0.005940497  2.167665 61.95421    72   84
## 4  00650 0.006277214  8.249021 17.70120    23   24
## 5  00410 0.007574224       Inf 11.80080    16   16
## 6  04144 0.008741059  1.776341 91.45621   103  124
##                                Term
## 1          Primary immunodeficiency
## 2 T cell receptor signaling pathway
## 3                       Hepatitis C
## 4              Butanoate metabolism
## 5           beta-Alanine metabolism
## 6                       Endocytosis
htmlReport(hgOver, file = "kegg.html")

Over represented KEGGs (Over expressed)

DEgenes <- rownames(over_exp.all)[over_exp.all$adj.P.Val < 0.01]
length(DEgenes)
## [1] 5094
geneUniverse <- rownames(se.paired.genes)
params <- new("KEGGHyperGParams", geneIds=DEgenes, universeGeneIds=geneUniverse,
              annotation="org.Hs.eg.db", pvalueCutoff=0.01, testDirection="over")

hgOver <- hyperGTest(params)

hgOver
## Gene to KEGG  test for over-representation 
## 220 KEGG ids tested (6 have p < 0.01)
## Selected gene set size: 1417 
##     Gene universe size: 3489 
##     Annotation package: org.Hs.eg
KEGGresults <- summary(hgOver)

# Filtering by counts and size
KEGGresults <- KEGGresults[KEGGresults$Size >= 5 & KEGGresults$Count >= 5, ]


KEGGresults <- KEGGresults[order(KEGGresults$OddsRatio, decreasing = TRUE), ]
KEGGresults <- KEGGresults[order(KEGGresults$Pvalue, decreasing = FALSE), ]

head(KEGGresults)
##   KEGGID       Pvalue OddsRatio ExpCount Count Size
## 1  05340 2.607983e-05  8.894925  8.51660    18   21
## 2  05014 5.960429e-05  4.879074 12.16657    23   30
## 3  05100 2.362078e-04  2.812164 21.08872    34   52
## 4  05416 3.336848e-03  2.959198 12.16657    20   30
## 5  04115 6.072972e-03  2.117623 20.68317    30   51
## 6  04660 8.238377e-03  1.957110 23.52204    33   58
##                                     Term
## 1               Primary immunodeficiency
## 2    Amyotrophic lateral sclerosis (ALS)
## 3 Bacterial invasion of epithelial cells
## 4                      Viral myocarditis
## 5                  p53 signaling pathway
## 6      T cell receptor signaling pathway
htmlReport(hgOver, file = "keggover.html")

Over represented KEGGs (Under expressed)

DEgenes <- rownames(under_exp.all)[under_exp.all$adj.P.Val < 0.01]
length(DEgenes)
## [1] 4014
geneUniverse <- rownames(se.paired.genes)
params <- new("KEGGHyperGParams", geneIds=DEgenes, universeGeneIds=geneUniverse,
              annotation="org.Hs.eg.db", pvalueCutoff=0.01, testDirection="over")

hgOver <- hyperGTest(params)

hgOver
## Gene to KEGG  test for over-representation 
## 223 KEGG ids tested (11 have p < 0.01)
## Selected gene set size: 1160 
##     Gene universe size: 3486 
##     Annotation package: org.Hs.eg
KEGGresults <- summary(hgOver)

# Filtering by counts and size
KEGGresults <- KEGGresults[KEGGresults$Size >= 5 & KEGGresults$Count >= 5, ]


KEGGresults <- KEGGresults[order(KEGGresults$OddsRatio, decreasing = TRUE), ]
KEGGresults <- KEGGresults[order(KEGGresults$Pvalue, decreasing = FALSE), ]

head(KEGGresults)
##   KEGGID       Pvalue OddsRatio  ExpCount Count Size
## 1  00190 8.727410e-08  3.947959  21.24785    42   64
## 2  01100 2.043796e-05  1.457041 212.81053   258  641
## 3  05012 2.437619e-04  2.599276  19.58786    33   59
## 4  00280 3.122433e-04  3.567603  10.95592    21   33
## 5  00071 3.122433e-04  3.567603  10.95592    21   33
## 6  00640 8.080144e-04  4.354959   7.30395    15   22
##                                         Term
## 1                  Oxidative phosphorylation
## 2                         Metabolic pathways
## 3                        Parkinson's disease
## 4 Valine, leucine and isoleucine degradation
## 5                     Fatty acid degradation
## 6                      Propanoate metabolism
htmlReport(hgOver, file = "keggunder.html")

When performing the functional enrichment analysis, some fo the gene sets have an OddsRatio value of Infinite. Which is the biological meaning of this?

NAIVE BAYES CLASSIFIER

Then, we performed a Hierarchical clustering on the set of paired samples used for the analysis and all the available samples, but only the 199 D.E. genes in both cases. This should give us a picture of how good these genes are for separating tumor and normal samples.

head(colnames(colData(se.paired.genes)))
## [1] "type"                     "bcr_patient_uuid"        
## [3] "bcr_patient_barcode"      "form_completion_date"    
## [5] "prospective_collection"   "retrospective_collection"
# Get a mask with DE genes to filter SE object
DE.genes <-  rownames(se.paired.genes) %in% rownames(over_exp) | 
             rownames(se.paired.genes) %in% rownames(under_exp)

# Filter SE object using DE.genes (only paired samples)
se.DE  <- se.paired.genes[DE.genes,]
dgenorm.DE <- dgenorm[DE.genes,]

# Filter SE object using DE.genes (all samples except paired)

se.all.DE <- se[mask,]            # ony high expressed genes
se.all.DE <- se.all.DE[DE.genes]  # only DE genes
se.all.DE <- se.all.DE[,!substr(colnames(se.filt), 9, 12) %in% common.codes]

dgenorm.all.DE <- dgenorm[DE.genes,]  # only DE genes
#logCPM <- cpm(dgenorm, log=TRUE, prior.count=3)
d <- as.dist(1-cor(assays(se.DE)$logCPMnorm, method="spearman"))
sampleClustering <- hclust(d)
batch <- as.integer(factor(se.DE$type))
sampleDendrogram <- as.dendrogram(sampleClustering, hang=0.1)
names(batch) <- colnames(se.DE)
outcome <- as.character(se.DE$type)
names(outcome) <- colnames(se.DE)
sampleDendrogram <- dendrapply(sampleDendrogram,
                               function(x, batch, labels) {
                                 if (is.leaf(x)) {
                                   attr(x, "nodePar") <- list(lab.col=as.vector(batch[attr(x, "label")]))
                                   attr(x, "label") <- as.vector(labels[attr(x, "label")])
                                 }
                                 x
                               }, batch, outcome)
plot(sampleDendrogram, main="Hierarchical clustering of samples")
legend("topright", paste("TYPE", sort(unique(batch)), levels(factor(se.DE$type))), fill=sort(unique(batch)))
Figure S23: Hierarchical clustering of the samples used in the analysis using only the DE genes.

Figure S23: Hierarchical clustering of the samples used in the analysis using only the DE genes.

#logCPM <- cpm(dgenorm, log=TRUE, prior.count=3)
d <- as.dist(1-cor(assays(se.all.DE)$logCPM, method="spearman"))
sampleClustering <- hclust(d)
batch <- as.integer(factor(se.all.DE$type))
sampleDendrogram <- as.dendrogram(sampleClustering, hang=0.1)
names(batch) <- colnames(se.all.DE)
outcome <- as.character(se.all.DE$type)
names(outcome) <- colnames(se.all.DE)
sampleDendrogram <- dendrapply(sampleDendrogram,
function(x, batch, labels) {
if (is.leaf(x)) {
attr(x, "nodePar") <- list(lab.col=as.vector(batch[attr(x, "label")]))
attr(x, "label") <- as.vector(labels[attr(x, "label")])
}
x
}, batch, outcome)
plot(sampleDendrogram, main="Hierarchical clustering of samples")
legend("topright", paste("TYPE", sort(unique(batch)), levels(factor(se.all.DE$type))), fill=sort(unique(batch)))
Figure S24: Hierarchical clustering of all samples using only the DE genes.

Figure S24: Hierarchical clustering of all samples using only the DE genes.

Building and evaluating the model

se.training <- as.data.frame(t(assay(se.DE)))
se.training$type <- colData(se.DE)$type

se.testing <- as.data.frame(t(assay(se.all.DE)))
se.testing$type <- colData(se.all.DE)$type

# Remove training samples from testing set
se.testing <- se.testing[!rownames(se.testing) %in% rownames(se.training),]

# Build model and predict
model <- naiveBayes(type ~., data=se.training)
nb.res <- table(predict(model, se.testing), se.testing[,200])

TP <- nb.res[2,2]
TN <- nb.res[1,1]
FP <- nb.res[2,1]
FN <- nb.res[1,2]

precision   <- TP / (TP + FP) 
sensitivity <- TP / (TP + FN)
specificity <- TN / (TN + FP) 
specificity <-
pred.results <- data.frame(precision, specificity, sensitivity)
pred.results
##   precision specificity sensitivity
## 1  0.997416   0.9655172   0.9212411

SESSION INFORMATION

sessionInfo()
## R version 3.2.3 (2015-12-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04 LTS
## 
## locale:
##  [1] LC_CTYPE=es_ES.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=es_ES.UTF-8        LC_COLLATE=es_ES.UTF-8    
##  [5] LC_MONETARY=es_ES.UTF-8    LC_MESSAGES=es_ES.UTF-8   
##  [7] LC_PAPER=es_ES.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] xtable_1.8-2               GSVAdata_1.6.0            
##  [3] hgu95a.db_3.2.2            GSEABase_1.32.0           
##  [5] e1071_1.6-7                KEGG.db_3.2.2             
##  [7] biomaRt_2.26.1             GOstats_2.36.0            
##  [9] graph_1.48.0               Category_2.36.0           
## [11] GO.db_3.2.2                Matrix_1.2-6              
## [13] org.Hs.eg.db_3.2.3         RSQLite_1.0.0             
## [15] DBI_0.4-1                  geneplotter_1.48.0        
## [17] annotate_1.48.0            XML_3.98-1.3              
## [19] AnnotationDbi_1.32.3       lattice_0.20-33           
## [21] ggplot2_2.1.0              sva_3.18.0                
## [23] genefilter_1.52.1          mgcv_1.8-12               
## [25] nlme_3.1-128               edgeR_3.12.1              
## [27] limma_3.26.9               SummarizedExperiment_1.0.2
## [29] Biobase_2.30.0             GenomicRanges_1.22.4      
## [31] GenomeInfoDb_1.6.3         IRanges_2.4.8             
## [33] S4Vectors_0.8.11           BiocGenerics_0.16.1       
## 
## loaded via a namespace (and not attached):
##  [1] splines_3.2.3          colorspace_1.2-6       htmltools_0.3.5       
##  [4] survival_2.39-4        RBGL_1.46.0            RColorBrewer_1.1-2    
##  [7] plyr_1.8.3             stringr_1.0.0          zlibbioc_1.16.0       
## [10] munsell_0.4.3          gtable_0.2.0           evaluate_0.9          
## [13] labeling_0.3           knitr_1.13             class_7.3-14          
## [16] Rcpp_0.12.5            KernSmooth_2.23-15     scales_0.4.0          
## [19] formatR_1.4            XVector_0.10.0         digest_0.6.9          
## [22] stringi_1.1.1          grid_3.2.3             tools_3.2.3           
## [25] bitops_1.0-6           magrittr_1.5           RCurl_1.95-4.8        
## [28] rmarkdown_0.9.6        AnnotationForge_1.12.2